import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import plotly.express as px
import plotly
import plotly.graph_objs as go
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pylab
import math
import statsmodels.formula.api as smf
import statsmodels.api as sm
import seaborn as sns
import scipy.stats as stats
from statsmodels.stats.stattools import durbin_watson
# pour afficher dynamiquement dans le notebook
from IPython.display import clear_output
from IPython.display import display, Math, Markdown
import time
from datetime import datetime
# Show every column when displaying DataFrames in the notebook.
pd.set_option('display.max_columns', None)
# Load the pre-built datasets; each CSV carries a stale index column
# ('Unnamed: 0') left over from an earlier to_csv, dropped right after loading.
echantillon = pd.read_csv("données/echantillon.csv")
echantillon = echantillon.drop(columns=['Unnamed: 0'])
data_ok = pd.read_csv("données/data_ok.csv")
data_ok = data_ok.drop(columns=['Unnamed: 0'])
gini_4 = pd.read_csv("données/gini_4.csv")
dist_revenus_ok = pd.read_csv("données/dist_revenus_ok.csv")
dist_revenus_ok = dist_revenus_ok.drop(columns=['Unnamed: 0'])
# Align the country key with the other tables (ISO3 code).
dist_revenus_ok = dist_revenus_ok.rename(columns = {"country" : "iso3"})
# Mean income per country (average over its quantile rows), merged back so each
# quantile row also carries its country's mean income.
dist_rev_moy = dist_revenus_ok.groupby(by = "iso3").mean()
dist_rev_moy = dist_rev_moy.rename(columns = {"income": "income_mean"}).reset_index()
dist_rev_moy = dist_rev_moy[["iso3", "income_mean"]]
dist_revenus_ok = pd.merge(left = dist_rev_moy, right = dist_revenus_ok, on = "iso3")
dist_revenus_ok = dist_revenus_ok[["iso3", "income_mean", "quantile", "income"]]
dist_revenus_ok
| iso3 | income_mean | quantile | income | |
|---|---|---|---|---|
| 0 | ALB | 2994.829902 | 1 | 728.89795 |
| 1 | ALB | 2994.829902 | 2 | 916.66235 |
| 2 | ALB | 2994.829902 | 3 | 1010.91600 |
| 3 | ALB | 2994.829902 | 4 | 1086.90780 |
| 4 | ALB | 2994.829902 | 5 | 1132.69970 |
| ... | ... | ... | ... | ... |
| 11595 | ZAF | 5617.904880 | 96 | 24553.56800 |
| 11596 | ZAF | 5617.904880 | 97 | 28858.03100 |
| 11597 | ZAF | 5617.904880 | 98 | 35750.29000 |
| 11598 | ZAF | 5617.904880 | 99 | 46297.31600 |
| 11599 | ZAF | 5617.904880 | 100 | 82408.55000 |
11600 rows × 4 columns
# Merge the per-quantile income distribution into the country-level dataset.
# how="outer" keeps unmatched rows on both sides; rows without a 'country'
# value (present only in dist_revenus_ok) are dropped right after, so the net
# effect is restricted to data_ok's countries.
#dist_revenus_ok = dist_revenus_ok.rename(columns = {"country" : "iso3"})
données_completes = pd.merge(left = data_ok, right = dist_revenus_ok, on = "iso3", how = "outer")
données_completes = données_completes[données_completes['country'].notna()]
# Working view: one row per (country, quantile).
WID = données_completes[["country", "quantile", "income", "income_mean", "moy_gini", "elasticite_ok"]]
WID
| country | quantile | income | income_mean | moy_gini | elasticite_ok | |
|---|---|---|---|---|---|---|
| 0 | Bangladesh | 1 | 276.93503 | 998.692409 | 33.300000 | 0.538000 |
| 1 | Bangladesh | 2 | 332.78653 | 998.692409 | 33.300000 | 0.538000 |
| 2 | Bangladesh | 3 | 359.20620 | 998.692409 | 33.300000 | 0.538000 |
| 3 | Bangladesh | 4 | 377.34985 | 998.692409 | 33.300000 | 0.538000 |
| 4 | Bangladesh | 5 | 392.61746 | 998.692409 | 33.300000 | 0.538000 |
| ... | ... | ... | ... | ... | ... | ... |
| 10795 | United States | 96 | 69926.37000 | 25503.581661 | 40.563636 | 0.537666 |
| 10796 | United States | 97 | 77634.82000 | 25503.581661 | 40.563636 | 0.537666 |
| 10797 | United States | 98 | 88482.84000 | 25503.581661 | 40.563636 | 0.537666 |
| 10798 | United States | 99 | 106765.26000 | 25503.581661 | 40.563636 | 0.537666 |
| 10799 | United States | 100 | 176928.55000 | 25503.581661 | 40.563636 | 0.537666 |
10800 rows × 6 columns
WID.shape
(10800, 6)
len(WID['country'].unique())
108
print(WID.isnull().any())
country False quantile False income False income_mean False moy_gini False elasticite_ok False dtype: bool
print(WID.duplicated().any())
print(WID.duplicated().sum())
False 0
from collections import Counter
def generate_incomes(n, pj, random_state=None):
    """Simulate n (child income, parent income) pairs for elasticity pj.

    Parents' log-incomes are drawn from a standard normal; the mean and
    variance have no impact on the final result (i.e. on the income-class
    computation). Child log-income is pj * ln(y_parent) + epsilon with
    epsilon ~ N(0, 1).

    Parameters
    ----------
    n : int
        Sample size.
    pj : float
        Intergenerational income elasticity.
    random_state : None, int or numpy.random.RandomState, optional
        Seed or generator for reproducible draws. The default None keeps the
        original behaviour (draws from the global NumPy random state).

    Returns
    -------
    tuple of numpy.ndarray
        (y_child, y_parents), incomes in levels (exponentiated logs).
    """
    if random_state is not None and not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)
    # Parents' log-incomes first, then the error term: the order matters so
    # the draws consume the random stream exactly as before.
    ln_y_parent = st.norm(0, 1).rvs(size=n, random_state=random_state)
    # One realisation of the error term epsilon.
    residues = st.norm(0, 1).rvs(size=n, random_state=random_state)
    return np.exp(pj * ln_y_parent + residues), np.exp(ln_y_parent)
def quantiles(l, nb_quantiles):
    """Map each value of the series *l* to its quantile class (1..nb_quantiles).

    The values are sorted and evenly spaced class labels are attached to the
    sorted sequence; each original value then looks its label up by value.
    NOTE(review): because the lookup is keyed on the values themselves,
    duplicated values all receive the label of their last sorted occurrence.
    """
    size = len(l)
    ordered = l.copy().sort_values()
    labels = np.round(np.arange(1, nb_quantiles + 1, nb_quantiles / size) - 0.5 + 1. / size)
    lookup = dict(zip(ordered, (int(lab) for lab in labels)))
    return pd.Series([lookup[value] for value in l])
def compute_quantiles(y_child, y_parents, nb_quantiles):
    """Pair each simulated child with its parent and their income classes.

    Returns a DataFrame with columns y_child / y_parents (incomes in levels)
    and c_i_child / c_i_parent (classes computed by `quantiles`).
    """
    children = pd.Series(y_child)
    parents = pd.Series(y_parents)
    child_class = quantiles(children, nb_quantiles)
    parent_class = quantiles(parents, nb_quantiles)
    frame = pd.concat([children, parents, child_class, parent_class], axis=1)
    frame.columns = ["y_child", "y_parents", "c_i_child", "c_i_parent"]
    return frame
def distribution(counts, nb_quantiles):
    """Turn a table of (c_i_parent, counts) rows into a probability vector.

    Entry q-1 of the returned list is the share of observations whose parent
    class is q, 0 when the class is absent from the table. An empty table
    (total count 0) yields a vector of zeros.
    """
    total = counts["counts"].sum()
    if total == 0:
        return [0] * nb_quantiles
    probabilities = []
    for parent_class in range(1, nb_quantiles + 1):
        matching = counts[counts.c_i_parent == parent_class]
        if len(matching):
            probabilities.append(matching["counts"].values[0] / total)
        else:
            probabilities.append(0)
    return probabilities
def conditional_distributions(sample, nb_quantiles):
    """Matrix of conditional distributions P(parent class | child class).

    Row i (0-based) of the returned array is the probability vector of the
    parent classes for children of class i+1, as computed by `distribution`.

    Uses GroupBy.size() instead of the original apply(len): same counts,
    idiomatic and faster.
    """
    counts = sample.groupby(["c_i_child", "c_i_parent"]).size()
    counts = counts.reset_index()
    counts.columns = ["c_i_child", "c_i_parent", "counts"]
    mat = []
    for child_quantile in range(1, nb_quantiles + 1):
        subset = counts[counts.c_i_child == child_quantile]
        mat.append(distribution(subset, nb_quantiles))
    return np.array(mat)
def plot_conditional_distributions(p, cd, nb_quantiles):
    """Stacked-bar plot of child-class probabilities per parent class.

    Follows the matplotlib stacked-bar recipe:
    https://matplotlib.org/gallery/lines_bars_and_markers/bar_stacked.html
    """
    #plt.figure(figsize=(10,10))
    positions = np.arange(nb_quantiles) + 1
    stacked_height = np.array([0] * nb_quantiles)
    for idx, child_row in enumerate(cd):
        plt.bar(positions, child_row, bottom=stacked_height, width=0.95, label=str(idx + 1) + "e")
        stacked_height = stacked_height + np.array(child_row)
    plt.axis([.5, nb_quantiles * 1.3, 0, 1])
    plt.title("p=" + str(p))
    plt.legend(ncol=1)
    plt.xlabel("quantile parents")
    plt.ylabel("probabilité du quantile enfant")
    plt.show()
def proba_cond(c_i_parent, c_i_child, mat):
    """Look up P(parent class | child class) in the conditional matrix.

    Both arguments are 0-based indices into `mat` (row = child, col = parent).
    """
    return mat[c_i_child][c_i_parent]
def smooth(x, y, box_percent=0.05, res=50, median=True):
    """Moving-window smoothing of y against x.

    Scans `res` evenly spaced positions over the range of x; at each position
    the y values whose x falls inside a window of width
    box_percent * (max(x) - min(x)) centred on the position are summarised
    with the median (default) or the mean.

    Returns (x_out, y_out) as numpy arrays. A window with no point yields NaN
    (numpy median/mean of an empty collection).
    """
    span = max(x) - min(x)
    half_window = span * box_percent / 2
    x_out = []
    y_out = []
    for center in np.arange(min(x), max(x), span / res):
        in_window = [y[i] for i in range(len(x))
                     if (center - half_window) <= x[i] <= (center + half_window)]
        if median == True:
            y_out.append(np.median(in_window))
        else:
            y_out.append(np.mean(in_window))
        x_out.append(center)
    return np.array(x_out), np.array(y_out)
%%time
# Questions 1-2: simulation parameters.
pj = 0.9
nb_quantiles = 10
n = 1000*nb_quantiles
# Question 3: draw child/parent incomes for elasticity pj.
y_child, y_parents = generate_incomes(n, pj)
# Question 4: attach child and parent income classes.
sample = compute_quantiles(y_child, y_parents, nb_quantiles)
# Question 5: matrix of conditional distributions P(parent class | child class).
cd = conditional_distributions(sample, nb_quantiles)
#plot_conditional_distributions(pj, cd, nb_quantiles) # slow for large nb_quantiles
# Example lookup. NOTE(review): proba_cond indexes cd directly, so these are
# 0-based indices (row 5 / col 8), while the printed message reads them as
# class labels — off by one relative to the 1-based classes; confirm intended.
c_i_child = 5
c_i_parent = 8
p = proba_cond(c_i_parent, c_i_child, cd)
print("\nP(c_i_parent = {} | c_i_child = {}, pj = {}) = {}".format(c_i_parent,
c_i_child,pj, p))
P(c_i_parent = 8 | c_i_child = 5, pj = 0.9) = 0.108 Wall time: 154 ms
# High elasticity (pj = 0.9): the child's income class is strongly tied to the
# parent's class.
pj_2 = 0.9
nb_quantiles_2 = 10
n_2 = 1000*nb_quantiles_2
y_child_2, y_parents_2 = generate_incomes(n_2, pj_2)
sample_2 = compute_quantiles(y_child_2, y_parents_2, nb_quantiles_2)
cd_2 = conditional_distributions(sample_2, nb_quantiles_2)
plot_conditional_distributions(pj_2, cd_2, nb_quantiles_2) # slow if nb_quantiles > 10
# Low elasticity (pj = 0.1): child classes nearly independent of parent classes.
pj_3 = 0.1
nb_quantiles_3 = 10
n_3 = 1000*nb_quantiles_3
y_child_3, y_parents_3 = generate_incomes(n_3, pj_3)
sample_3 = compute_quantiles(y_child_3, y_parents_3, nb_quantiles_3)
cd_3 = conditional_distributions(sample_3, nb_quantiles_3)
plot_conditional_distributions(pj_3, cd_3, nb_quantiles_3) # slow if nb_quantiles > 10
# Drop the simulated individuals; only the conditional distributions "cd"
# matter from here on.
del pj, nb_quantiles, n, y_child, y_parents, sample, c_i_child, c_i_parent, p, cd
# Clone each (country, quantile) row 500 times: 10,800 rows -> 5,400,000.
data_cloned = pd.concat([WID]*500, ignore_index=True)
print('WID shape :', WID.shape)
print('data_cloned shape :', data_cloned.shape)
WID shape : (10800, 6) data_cloned shape : (5400000, 6)
for col in data_cloned.columns:
print(col)
country quantile income income_mean moy_gini elasticite_ok
data_cloned.head(2)
| country | quantile | income | income_mean | moy_gini | elasticite_ok | |
|---|---|---|---|---|---|---|
| 0 | Bangladesh | 1 | 276.93503 | 998.692409 | 33.3 | 0.538 |
| 1 | Bangladesh | 2 | 332.78653 | 998.692409 | 33.3 | 0.538 |
# Keep the modelling columns and rename them to the model's notation:
# c_i_child = child income class, y_child = child income, G_j = Gini index,
# p_j = intergenerational elasticity.
data_cloned = data_cloned[["country", "quantile", "income", "moy_gini", "elasticite_ok"]]
data_cloned.rename(columns={'quantile': 'c_i_child', 'income': 'y_child', 'moy_gini': 'G_j', 'elasticite_ok': 'p_j'}, inplace=True)
data_cloned.head(2)
| country | c_i_child | y_child | G_j | p_j | |
|---|---|---|---|---|---|
| 0 | Bangladesh | 1 | 276.93503 | 33.3 | 0.538 |
| 1 | Bangladesh | 2 | 332.78653 | 33.3 | 0.538 |
data_cloned.sample(5)
| country | c_i_child | y_child | G_j | p_j | |
|---|---|---|---|---|---|
| 336662 | Finland | 63 | 16394.93600 | 27.785714 | 0.112876 |
| 4639400 | Malaysia | 1 | 589.08760 | 45.566667 | 0.540000 |
| 2448614 | Madagascar | 15 | 113.56046 | 41.966667 | 0.689613 |
| 2120007 | Luxembourg | 8 | 10340.48300 | 30.885714 | 0.380792 |
| 4908897 | China | 98 | 9382.28400 | 41.150000 | 0.399000 |
country_list = data_cloned['country'].unique()
len(country_list)
108
# Parent income class for each cloned individual, filled country by country in
# the loop below.
list_proba = []
%%time
# For each country in the list: simulate the conditional distribution of
# parent classes and generate a parent class for every clone.
for country in country_list :
    # Elasticity coefficient for this country (constant per country, so the
    # first matching row of data_cloned is enough).
    pj = data_cloned.loc[data_cloned['country'] == country,'p_j'].iloc[0]
    # Number of quantiles (income classes).
    nb_quantiles = 100
    # Simulated sample size.
    n = 50000
    # Draw incomes according to a normal law (in logs).
    y_child, y_parents = generate_incomes(n, pj)
    # DataFrame with y_child, y_parents, c_i_child, c_i_parent.
    sample = compute_quantiles(y_child, y_parents, nb_quantiles)
    # Conditional probabilities P(parent class | child class).
    cd = conditional_distributions(sample, nb_quantiles)
    # For each (child class, parent class) pair, append the parent class
    # int(p*500) times. NOTE(review): this lines up with the 500 clones per
    # class only if each child class holds exactly n/nb_quantiles = 500
    # simulated rows, so that p*500 is an integer count — confirm.
    for c_i_child in range(100):
        for c_i_parent in range(100):
            p = proba_cond(c_i_parent, c_i_child, cd)
            #print("\nP(c_i_parent = {} | c_i_child = {}, pj = {}) = {}".format(c_i_parent, c_i_child,pj,p))
            # Attach the conditional probabilities to the individuals.
            list_proba.extend([c_i_parent+1]*(int(p*500)))
Wall time: 5min 27s
len(sample)
50000
sample.head()
| y_child | y_parents | c_i_child | c_i_parent | |
|---|---|---|---|---|
| 0 | 1.759885 | 2.971491 | 69 | 87 |
| 1 | 1.076300 | 0.528164 | 53 | 27 |
| 2 | 0.362200 | 0.842355 | 19 | 44 |
| 3 | 1.198692 | 0.952023 | 56 | 48 |
| 4 | 1.274805 | 10.589671 | 59 | 100 |
# Mesure de la mobilité
p
0.112
# Je check que ma liste à le meme nombre de ligne que mon data_cloned
len(list_proba)
5400000
# Add the generated parent class as a column of data_cloned.
data_cloned['proba'] = list_proba
# The column holds class numbers, not probabilities: rename accordingly.
data_cloned = data_cloned.rename({'proba':'c_i_parent'},axis=1)
data_cloned.head()
| country | c_i_child | y_child | G_j | p_j | c_i_parent | |
|---|---|---|---|---|---|---|
| 0 | Bangladesh | 1 | 276.93503 | 33.3 | 0.538 | 1 |
| 1 | Bangladesh | 2 | 332.78653 | 33.3 | 0.538 | 1 |
| 2 | Bangladesh | 3 | 359.20620 | 33.3 | 0.538 | 1 |
| 3 | Bangladesh | 4 | 377.34985 | 33.3 | 0.538 | 1 |
| 4 | Bangladesh | 5 | 392.61746 | 33.3 | 0.538 | 1 |
# Mean child income per country, computed from the cloned data.
# NOTE(review): this mean_income is rebuilt from WID in the next cell and
# therefore overwritten; kept here as in the original notebook.
mean_income = data_cloned.groupby(by='country').mean()
mean_income.reset_index(inplace=True)
mean_income = mean_income[['country', 'y_child']]
mean_income.rename(columns={'y_child': 'm_j'}, inplace=True)
mean_income.head(2)
| country | m_j | |
|---|---|---|
| 0 | Albania | 2994.829902 |
| 1 | Argentina | 5847.884654 |
# Rebuild mean income per country (m_j) from WID: one row per country.
# (The export had fused these four statements onto a single line, which is a
# syntax error in Python; they are split back into separate statements.)
mean_income = WID[["country", "income_mean"]].drop_duplicates().reset_index()
mean_income.rename(columns={'income_mean': 'm_j'}, inplace=True)
mean_income = mean_income[["country", "m_j"]]
mean_income.head()
# Join m_j onto the cloned data and keep the model variables (c_i_child is
# dropped here).
data_cloned_b = pd.merge(data_cloned, mean_income, on='country')
data_cloned_b = data_cloned_b[["country", "y_child", "G_j", "p_j", "c_i_parent", "m_j"]]
# Log transforms of the child income and of the country mean income.
data_cloned_b["income_log"] = np.log(data_cloned_b["y_child"])
data_cloned_b["m_j_log"] = np.log(data_cloned_b["m_j"])
data_cloned_b.loc[data_cloned_b["country"] == "France"]
| country | y_child | G_j | p_j | c_i_parent | m_j | income_log | m_j_log | |
|---|---|---|---|---|---|---|---|---|
| 950000 | France | 2958.3040 | 31.142857 | 0.357105 | 49 | 18309.407545 | 7.992371 | 9.81517 |
| 950001 | France | 4412.6753 | 31.142857 | 0.357105 | 50 | 18309.407545 | 8.392236 | 9.81517 |
| 950002 | France | 4939.6350 | 31.142857 | 0.357105 | 50 | 18309.407545 | 8.505047 | 9.81517 |
| 950003 | France | 5422.9165 | 31.142857 | 0.357105 | 50 | 18309.407545 | 8.598389 | 9.81517 |
| 950004 | France | 5906.5376 | 31.142857 | 0.357105 | 50 | 18309.407545 | 8.683815 | 9.81517 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 999995 | France | 39873.5100 | 31.142857 | 0.357105 | 59 | 18309.407545 | 10.593467 | 9.81517 |
| 999996 | France | 43693.0350 | 31.142857 | 0.357105 | 59 | 18309.407545 | 10.684944 | 9.81517 |
| 999997 | France | 49489.5820 | 31.142857 | 0.357105 | 60 | 18309.407545 | 10.809517 | 9.81517 |
| 999998 | France | 60758.6130 | 31.142857 | 0.357105 | 60 | 18309.407545 | 11.014664 | 9.81517 |
| 999999 | France | 122775.1640 | 31.142857 | 0.357105 | 60 | 18309.407545 | 11.718110 | 9.81517 |
50000 rows × 8 columns
data_cloned_b.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5400000 entries, 0 to 5399999 Data columns (total 8 columns): # Column Dtype --- ------ ----- 0 country object 1 y_child float64 2 G_j float64 3 p_j float64 4 c_i_parent int64 5 m_j float64 6 income_log float64 7 m_j_log float64 dtypes: float64(6), int64(1), object(1) memory usage: 370.8+ MB
# Sauvegarde du dataframe
#data_cloned_b.to_csv("données/data_cloned_b.csv", index=False)
data_cloned_b.sample(5)
| country | y_child | G_j | p_j | c_i_parent | m_j | income_log | m_j_log | |
|---|---|---|---|---|---|---|---|---|
| 1159123 | Hungary | 3833.15650 | 29.660000 | 0.400000 | 37 | 6101.341229 | 8.251444 | 8.716264 |
| 37965 | Bangladesh | 979.83950 | 33.300000 | 0.538000 | 73 | 998.692409 | 6.887389 | 6.906447 |
| 2072725 | Spain | 7964.57700 | 33.371429 | 0.423757 | 22 | 13116.992910 | 8.982759 | 9.481664 |
| 4368728 | Uganda | 489.88907 | 43.700000 | 1.029195 | 66 | 987.206289 | 6.194179 | 6.894879 |
| 1248167 | Iceland | 27973.72900 | 29.216667 | 0.400000 | 85 | 26888.511518 | 10.239021 | 10.199454 |
# Six-country sub-sample for the graphical analyses. Series.isin replaces the
# original chain of `|`-combined equality filters: same rows selected, one
# vectorised membership test.
pays_cibles = ["France", "Chile", "Iceland", "Paraguay", "Vietnam",
               "Congo, Dem. Rep."]
echantillon_m4 = data_cloned_b.loc[data_cloned_b["country"].isin(pays_cibles)]
echantillon_m4
| country | y_child | G_j | p_j | c_i_parent | m_j | income_log | m_j_log | |
|---|---|---|---|---|---|---|---|---|
| 950000 | France | 2958.3040 | 31.142857 | 0.357105 | 49 | 18309.407545 | 7.992371 | 9.815170 |
| 950001 | France | 4412.6753 | 31.142857 | 0.357105 | 50 | 18309.407545 | 8.392236 | 9.815170 |
| 950002 | France | 4939.6350 | 31.142857 | 0.357105 | 50 | 18309.407545 | 8.505047 | 9.815170 |
| 950003 | France | 5422.9165 | 31.142857 | 0.357105 | 50 | 18309.407545 | 8.598389 | 9.815170 |
| 950004 | France | 5906.5376 | 31.142857 | 0.357105 | 50 | 18309.407545 | 8.683815 | 9.815170 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5149995 | Paraguay | 10326.7880 | 53.533333 | 0.660000 | 100 | 3278.080965 | 9.242497 | 8.095013 |
| 5149996 | Paraguay | 11809.6210 | 53.533333 | 0.660000 | 100 | 3278.080965 | 9.376670 | 8.095013 |
| 5149997 | Paraguay | 13770.1250 | 53.533333 | 0.660000 | 100 | 3278.080965 | 9.530257 | 8.095013 |
| 5149998 | Paraguay | 17849.6070 | 53.533333 | 0.660000 | 100 | 3278.080965 | 9.789737 | 8.095013 |
| 5149999 | Paraguay | 43296.3830 | 53.533333 | 0.660000 | 100 | 3278.080965 | 10.675824 | 8.095013 |
300000 rows × 8 columns
# Box plots of income in levels, one per country of the sub-sample.
df = echantillon_m4
fig = px.box(df, x="y_child", y="country", color="country")
fig.show()
print("Les distributions des revenus des individus varient fortement entre les pays dans l'échantillon choisi")
Les distributions des revenus des individus varient fortement entre les pays dans l'échantillon choisi
# Same box plots on log-incomes: the scale compresses the disparities.
df = echantillon_m4
fig = px.box(df, x="income_log", y="country", color="country")
fig.show()
print("Les distributions des revenus en log des individus varient fortement entre les pays dans l'échantillon choisi")
Les distributions des revenus en log des individus varient fortement entre les pays dans l'échantillon choisi
# Mean child income per country (levels).
plt.figure(figsize=(30,5))
rs = plt.plot(data_cloned_b.groupby('country').y_child.mean(), 'o')
rs = plt.xticks(rotation=90)
plt.xlabel('pays')
plt.ylabel('revenu moyen des enfants')
Text(0, 0.5, 'revenu moyen des enfants')
# Mean of the log-income per country.
# NOTE(review): the y label says "revenu moyen" but the plotted quantity is
# the mean of income_log — consider relabelling.
plt.figure(figsize=(30,5))
rs = plt.plot(data_cloned_b.groupby('country').income_log.mean(), 'o')
rs = plt.xticks(rotation=90)
plt.xlabel('pays')
plt.ylabel('revenu moyen des enfants')
Text(0, 0.5, 'revenu moyen des enfants')
# Aggregation to cut computation time: grouping on every country-level and
# income column collapses the 500 identical clones back to one row each
# (5,400,000 -> 10,800 rows); mean() then averages c_i_parent over the clones.
data_cloned_c = data_cloned_b.groupby(by=['country',
                                          'y_child',
                                          'm_j',
                                          'G_j',
                                          'p_j',
                                          'm_j_log',
                                          'income_log']).mean()
data_cloned_c.reset_index(inplace=True)
#data_projet7_m4_2.drop(columns=['c_i_parent'], inplace=True) # variable not needed afterwards
# Round the averaged parent class back to a whole class number.
data_cloned_c.c_i_parent = data_cloned_c.c_i_parent.round()
print('data_cloned_c shape :', data_cloned_c.shape)
print('data_cloned_b shape :', data_cloned_b.shape)
data_cloned_c shape : (10800, 8) data_cloned_b shape : (5400000, 8)
data_cloned_c.sample(5)
| country | y_child | m_j | G_j | p_j | m_j_log | income_log | c_i_parent | |
|---|---|---|---|---|---|---|---|---|
| 6276 | Luxembourg | 30899.0800 | 25217.562681 | 30.885714 | 0.380792 | 10.135296 | 10.338482 | 55.0 |
| 1012 | Bosnia and Herzegovina | 2452.1714 | 6334.687311 | 32.366667 | 0.827252 | 8.753796 | 7.804729 | 43.0 |
| 9407 | Tajikistan | 805.2709 | 2069.852224 | 32.000000 | 0.400000 | 7.635232 | 6.691179 | 43.0 |
| 6093 | Liberia | 1357.1670 | 615.007297 | 36.500000 | 0.660000 | 6.421634 | 7.213155 | 59.0 |
| 6554 | Malaysia | 4437.6973 | 6006.342359 | 45.566667 | 0.540000 | 8.700571 | 8.397891 | 51.0 |
%%time
# One-way ANOVA (OLS with country dummies): does the country of origin
# explain the child's income?
anova_pays_rar = smf.ols('y_child~country', data=data_cloned_c).fit()
print(anova_pays_rar.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y_child R-squared: 0.500
Model: OLS Adj. R-squared: 0.495
Method: Least Squares F-statistic: 99.97
Date: Fri, 06 May 2022 Prob (F-statistic): 0.00
Time: 17:55:38 Log-Likelihood: -1.1057e+05
No. Observations: 10800 AIC: 2.214e+05
Df Residuals: 10692 BIC: 2.221e+05
Df Model: 107
Covariance Type: nonrobust
=======================================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------------------
Intercept 2994.8299 679.584 4.407 0.000 1662.718 4326.941
country[T.Argentina] 2853.0548 961.077 2.969 0.003 969.165 4736.945
country[T.Armenia] -1366.4471 961.077 -1.422 0.155 -3250.337 517.443
country[T.Austria] 1.364e+04 961.077 14.195 0.000 1.18e+04 1.55e+04
country[T.Azerbaijan] -637.4009 961.077 -0.663 0.507 -2521.291 1246.489
country[T.Bangladesh] -1996.1375 961.077 -2.077 0.038 -3880.028 -112.247
country[T.Belarus] 926.3336 961.077 0.964 0.335 -957.556 2810.224
country[T.Belgium] 1.203e+04 961.077 12.517 0.000 1.01e+04 1.39e+04
country[T.Bhutan] -1478.9008 961.077 -1.539 0.124 -3362.791 404.989
country[T.Bolivia] 21.4339 961.077 0.022 0.982 -1862.456 1905.324
country[T.Bosnia and Herzegovina] 3339.8574 961.077 3.475 0.001 1455.967 5223.748
country[T.Brazil] 1812.6547 961.077 1.886 0.059 -71.235 3696.545
country[T.Bulgaria] 1990.1487 961.077 2.071 0.038 106.259 3874.039
country[T.Burkina Faso] -2076.8772 961.077 -2.161 0.031 -3960.767 -192.987
country[T.Cameroon] -1200.3359 961.077 -1.249 0.212 -3084.226 683.554
country[T.Canada] 2.074e+04 961.077 21.585 0.000 1.89e+04 2.26e+04
country[T.Central African Republic] -2183.5300 961.077 -2.272 0.023 -4067.420 -299.640
country[T.Chile] 4056.7801 961.077 4.221 0.000 2172.890 5940.670
country[T.China] -472.0712 961.077 -0.491 0.623 -2355.961 1411.819
country[T.Colombia] 552.1754 961.077 0.575 0.566 -1331.715 2436.065
country[T.Congo, Dem. Rep.] -2718.8139 961.077 -2.829 0.005 -4602.704 -834.924
country[T.Costa Rica] 2585.5567 961.077 2.690 0.007 701.667 4469.447
country[T.Cote d'Ivoire] -2594.9947 961.077 -2.700 0.007 -4478.885 -711.105
country[T.Cyprus] 1.435e+04 961.077 14.932 0.000 1.25e+04 1.62e+04
country[T.Czech Republic] 5240.4635 961.077 5.453 0.000 3356.573 7124.354
country[T.Denmark] 1.405e+04 961.077 14.617 0.000 1.22e+04 1.59e+04
country[T.Dominican Republic] 563.5722 961.077 0.586 0.558 -1320.318 2447.462
country[T.Ecuador] 388.9111 961.077 0.405 0.686 -1494.979 2272.801
country[T.Egypt, Arab Rep.] -1025.2525 961.077 -1.067 0.286 -2909.143 858.638
country[T.El Salvador] -139.6051 961.077 -0.145 0.885 -2023.495 1744.285
country[T.Estonia] 4707.2327 961.077 4.898 0.000 2823.343 6591.123
country[T.Eswatini] -2464.5461 961.077 -2.564 0.010 -4348.436 -580.656
country[T.Fiji] -896.0990 961.077 -0.932 0.351 -2779.989 987.791
country[T.Finland] 1.331e+04 961.077 13.851 0.000 1.14e+04 1.52e+04
country[T.France] 1.531e+04 961.077 15.935 0.000 1.34e+04 1.72e+04
country[T.Georgia] -1631.0713 961.077 -1.697 0.090 -3514.961 252.819
country[T.Germany] 1.507e+04 961.077 15.677 0.000 1.32e+04 1.7e+04
country[T.Ghana] -2258.2274 961.077 -2.350 0.019 -4142.117 -374.337
country[T.Greece] 8732.4444 961.077 9.086 0.000 6848.554 1.06e+04
country[T.Guatemala] -852.3551 961.077 -0.887 0.375 -2736.245 1031.535
country[T.Guinea] -2298.8179 961.077 -2.392 0.017 -4182.708 -414.928
country[T.Honduras] 301.4385 961.077 0.314 0.754 -1582.452 2185.329
country[T.Hungary] 3106.5113 961.077 3.232 0.001 1222.621 4990.401
country[T.Iceland] 2.389e+04 961.077 24.861 0.000 2.2e+04 2.58e+04
country[T.India] -2070.5604 961.077 -2.154 0.031 -3954.451 -186.670
country[T.Indonesia] -1660.2116 961.077 -1.727 0.084 -3544.102 223.678
country[T.Iran, Islamic Rep.] 2837.8256 961.077 2.953 0.003 953.935 4721.716
country[T.Iraq] -1289.3176 961.077 -1.342 0.180 -3173.208 594.573
country[T.Ireland] 1.472e+04 961.077 15.312 0.000 1.28e+04 1.66e+04
country[T.Israel] 8105.4882 961.077 8.434 0.000 6221.598 9989.378
country[T.Italy] 1.193e+04 961.077 12.414 0.000 1e+04 1.38e+04
country[T.Japan] 1.444e+04 961.077 15.023 0.000 1.26e+04 1.63e+04
country[T.Jordan] 53.8011 961.077 0.056 0.955 -1830.089 1937.691
country[T.Kazakhstan] -755.6801 961.077 -0.786 0.432 -2639.570 1128.210
country[T.Kenya] -2475.5099 961.077 -2.576 0.010 -4359.400 -591.620
country[T.Korea, Rep.] 1.223e+04 961.077 12.728 0.000 1.03e+04 1.41e+04
country[T.Kosovo] -818.5609 961.077 -0.852 0.394 -2702.451 1065.329
country[T.Kyrgyz Republic] -1221.6107 961.077 -1.271 0.204 -3105.501 662.279
country[T.Lao PDR] -1991.4225 961.077 -2.072 0.038 -3875.313 -107.532
country[T.Latvia] 3769.6447 961.077 3.922 0.000 1885.755 5653.535
country[T.Liberia] -2379.8226 961.077 -2.476 0.013 -4263.713 -495.933
country[T.Lithuania] 3628.8267 961.077 3.776 0.000 1744.937 5512.717
country[T.Luxembourg] 2.222e+04 961.077 23.123 0.000 2.03e+04 2.41e+04
country[T.Madagascar] -2649.5928 961.077 -2.757 0.006 -4533.483 -765.703
country[T.Malawi] -2080.5645 961.077 -2.165 0.030 -3964.455 -196.674
country[T.Malaysia] 3011.5125 961.077 3.133 0.002 1127.622 4895.403
country[T.Mali] -2313.7549 961.077 -2.407 0.016 -4197.645 -429.865
country[T.Mauritania] -1196.2208 961.077 -1.245 0.213 -3080.111 687.669
country[T.Mexico] 891.0004 961.077 0.927 0.354 -992.890 2774.890
country[T.Moldova] -845.6585 961.077 -0.880 0.379 -2729.549 1038.232
country[T.Mongolia] -656.7425 961.077 -0.683 0.494 -2540.633 1227.148
country[T.Morocco] -657.2293 961.077 -0.684 0.494 -2541.119 1226.661
country[T.Mozambique] -2302.3494 961.077 -2.396 0.017 -4186.240 -418.459
country[T.Nepal] -2080.4383 961.077 -2.165 0.030 -3964.328 -196.548
country[T.Netherlands] 1.473e+04 961.077 15.331 0.000 1.28e+04 1.66e+04
country[T.Nicaragua] -475.4966 961.077 -0.495 0.621 -2359.387 1408.393
country[T.Niger] -2344.7005 961.077 -2.440 0.015 -4228.591 -460.810
country[T.Nigeria] -2302.9745 961.077 -2.396 0.017 -4186.865 -419.084
country[T.Norway] 1.949e+04 961.077 20.278 0.000 1.76e+04 2.14e+04
country[T.Pakistan] -2106.9906 961.077 -2.192 0.028 -3990.881 -223.101
country[T.Panama] 2140.3095 961.077 2.227 0.026 256.419 4024.200
country[T.Paraguay] 283.2511 961.077 0.295 0.768 -1600.639 2167.141
country[T.Peru] 335.7034 961.077 0.349 0.727 -1548.187 2219.594
country[T.Philippines] -1520.7670 961.077 -1.582 0.114 -3404.657 363.123
country[T.Poland] 2746.8903 961.077 2.858 0.004 863.000 4630.780
country[T.Portugal] 7103.8455 961.077 7.392 0.000 5219.955 8987.736
country[T.Romania] 323.5073 961.077 0.337 0.736 -1560.383 2207.397
country[T.Slovak Republic] 3101.7500 961.077 3.227 0.001 1217.860 4985.640
country[T.Slovenia] 9111.1776 961.077 9.480 0.000 7227.287 1.1e+04
country[T.South Africa] 2623.0750 961.077 2.729 0.006 739.185 4506.965
country[T.Spain] 1.012e+04 961.077 10.532 0.000 8238.273 1.2e+04
country[T.Sri Lanka] -1116.8897 961.077 -1.162 0.245 -3000.780 767.000
country[T.Sweden] 1.319e+04 961.077 13.724 0.000 1.13e+04 1.51e+04
country[T.Syrian Arab Republic] -2309.0124 961.077 -2.403 0.016 -4192.903 -425.122
country[T.Tajikistan] -924.9777 961.077 -0.962 0.336 -2808.868 958.912
country[T.Tanzania] -2406.0629 961.077 -2.504 0.012 -4289.953 -522.173
country[T.Thailand] -251.1173 961.077 -0.261 0.794 -2135.007 1632.773
country[T.Timor-Leste] -2267.2195 961.077 -2.359 0.018 -4151.110 -383.329
country[T.Turkey] 3055.6354 961.077 3.179 0.001 1171.745 4939.526
country[T.Uganda] -2007.6236 961.077 -2.089 0.037 -3891.514 -123.734
country[T.Ukraine] 354.5584 961.077 0.369 0.712 -1529.332 2238.449
country[T.United Kingdom] 1.871e+04 961.077 19.473 0.000 1.68e+04 2.06e+04
country[T.United States] 2.251e+04 961.077 23.420 0.000 2.06e+04 2.44e+04
country[T.Uruguay] 2295.9550 961.077 2.389 0.017 412.065 4179.845
country[T.Venezuela, RB] 172.3178 961.077 0.179 0.858 -1711.572 2056.208
country[T.Vietnam] -1617.0559 961.077 -1.683 0.092 -3500.946 266.834
country[T.West Bank and Gaza] -1880.7314 961.077 -1.957 0.050 -3764.621 3.159
country[T.Yemen, Rep.] -1952.1940 961.077 -2.031 0.042 -3836.084 -68.304
==============================================================================
Omnibus: 13716.488 Durbin-Watson: 0.679
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3991004.638
Skew: 6.801 Prob(JB): 0.00
Kurtosis: 96.187 Cond. No. 109.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Wall time: 470 ms
La p-value du test F (test de Fisher) étant proche de 0, on peut au niveau de test de 5 % rejeter H0 et admettre H1 : ainsi on peut conclure que le pays d'origine influe bien sur les revenus. <br/> Par ailleurs, d'après ce modèle, le pays d'origine explique 50 % de la variance du revenu.
Test 1 : Distribution des résidus <br/> On vérifie l'adéquation de la distribution des résidus de l'ANOVA à une loi normale. L'hypothèse nulle H0 étant que les résidus suivent une loi normale, on procède à un test de Kolmogorov-Smirnov sur les résidus.
# Kolmogorov-Smirnov test of the ANOVA residuals against the normal law.
# NOTE(review): kstest(x, 'norm') compares against N(0, 1) without
# standardising the residuals first, which makes rejection nearly automatic
# on this scale — confirm whether the residuals should be standardised.
x = anova_pays_rar.resid
stats.kstest(x, 'norm')
KstestResult(statistic=0.6575354154101319, pvalue=0.0)
H0 : les données suivent une loi normale <br/> H1 : les données ne suivent pas une loi normale <br/> p-value < 0,05 : on rejette H0, les données ne suivent pas une loi normale
Le test KS affichant une p-value nulle, les données ne suivent pas une loi normale. <br/> On observe la distribution des résidus sur une droite de Henry.
# Q-Q plot ("droite de Henry") of the residuals against the normal law.
stats.probplot(x, dist="norm", plot=pylab)
pylab.show()
On a graphiquement une part élevée de résidus s'éloignant de la droite théorique, donc on peut dire que les résidus ne suivent vraisemblablement pas une loi normale.
Test 2 : Homoscédasticité <br/> On observe la variance des résidus. H0 étant l'hypothèse nulle d'homoscédasticité des résidus, on réalise un test de Breusch-Pagan sur les résidus.
H0 : homoscédasticité H1 : hétéroscédasticité
# Breusch-Pagan test; returns (LM stat, LM p-value, F stat, F p-value).
print(sm.stats.diagnostic.het_breuschpagan(anova_pays_rar.resid, anova_pays_rar.model.exog))
(393.6532268517739, 2.1114974213686448e-34, 3.7799903775757215, 3.012886529018643e-35)
Avec une p-value proche de 0 (seconde valeur renvoyée), on rejette H0 et on admet l'hétéroscédasticité des résidus. <br/> On peut observer les variances des résidus sur le nuage de variance résiduelle.
# Residuals vs fitted values: visual check of the residual variance.
ax=plt.plot(anova_pays_rar.fittedvalues, anova_pays_rar.resid , ".", alpha=0.3)
plt.title("Nuage de la variance résiduelle", fontsize=18)
# NOTE(review): the commented labels mention "GWh" — leftover from another
# project; presumably should read income/residuals if re-enabled.
#plt.xlabel("GWh", fontsize=16), plt.ylabel("Résidus", fontsize=16)
Text(0.5, 1.0, 'Nuage de la variance résiduelle')
On peut confirmer la non-linéarité avec un test de Rainbow, qui vérifie H0 : la représentation statistique est bien linéaire. <br/> La p-value renvoyée par ce test devrait donc être supérieure à 0,05 pour qu'on considère que le modèle de régression peut être conservé. <br/> Ici, la p-value étant inférieure au seuil, on rejette H0.
# Rainbow linearity test: H0 = the linear specification is adequate.
from statsmodels.stats.diagnostic import linear_rainbow
rainbow_stat, rainbow_pval = linear_rainbow(anova_pays_rar)
print(rainbow_pval)
0.024397832554924188
En conclusion, si ce modèle explique bien 50% de la variance du revenu des individus avec le pays d'origine, il ne semble pas être robuste aux tests usuels et ne peut donc être utilisé pour une prédiction pertinente.
# Model 1 — explanatory variables: the country's Gini index (G_j) and the
# country's mean income (m_j).
modele_1 = smf.ols('y_child ~ G_j+m_j', data=data_cloned_c)
reg_v1_1 = modele_1.fit()
print(reg_v1_1.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y_child R-squared: 0.500
Model: OLS Adj. R-squared: 0.500
Method: Least Squares F-statistic: 5401.
Date: Fri, 06 May 2022 Prob (F-statistic): 0.00
Time: 17:55:39 Log-Likelihood: -1.1057e+05
No. Observations: 10800 AIC: 2.211e+05
Df Residuals: 10797 BIC: 2.212e+05
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -4.535e-11 351.153 -1.29e-13 1.000 -688.324 688.324
G_j -1.002e-12 8.127 -1.23e-13 1.000 -15.931 15.931
m_j 1.0000 0.010 95.932 0.000 0.980 1.020
==============================================================================
Omnibus: 13716.488 Durbin-Watson: 0.679
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3991004.638
Skew: 6.801 Prob(JB): 0.00
Kurtosis: 96.187 Cond. No. 4.90e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.9e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
# Variance decomposition of model 1: total (SCT = centered TSS),
# explained (SCE = ESS) and residual (SCR = SSR) sums of squares, plus R².
reg_v1_1_sct = reg_v1_1.centered_tss
reg_v1_1_sce = reg_v1_1.ess
reg_v1_1_scr = reg_v1_1.ssr
reg_v1_1_r2 = reg_v1_1.rsquared
display(Markdown(f"""
Décomposition de la variance : <br>
**SCT = SCE + SCR**
- SCT = {reg_v1_1_sct:.0f} <br>
- SCE = {reg_v1_1_sce:.0f} <br>
- SCR = {reg_v1_1_scr:.0f} <br>
$R^2 = {reg_v1_1_r2:.2f}$
Le modèle explique **{reg_v1_1_r2*100:.0f}% de la variance totale**.
"""))
Décomposition de la variance :
SCT = SCE + SCR
$R^2 = 0.50$
Le modèle explique 50% de la variance totale.
Selon ce modèle, le pays de naissance (ie. le revenu moyen et l’indice de Gini) explique donc 50% de la variance totale, tandis que les autres facteurs non considérés dans le modèle (efforts, chance, etc.) représentent l'autre moitié. Par ailleurs, l'indice de Gini n'est pas statistiquement significatif au seuil de 0,05.
# Model 1_2 — explanatory variables: log of the country's mean income and
# the country's Gini index. Variance decomposition displayed as Markdown.
reg_v1_2 = smf.ols('income_log ~ G_j+m_j_log', data=data_cloned_c).fit()
reg_v1_2_sct = reg_v1_2.centered_tss
reg_v1_2_sce = reg_v1_2.ess
reg_v1_2_scr = reg_v1_2.ssr
reg_v1_2_r2 = reg_v1_2.rsquared
display(Markdown(f"""
Décomposition de la variance : <br>
**SCT = SCE + SCR**
- SCT = {reg_v1_2_sct:.0f} <br>
- SCE = {reg_v1_2_sce:.0f} <br>
- SCR = {reg_v1_2_scr:.0f} <br>
$R^2 = {reg_v1_2_r2:.2f}$
Le modèle explique **{reg_v1_2_r2*100:.0f}% de la variance totale**.
"""))
Décomposition de la variance :
SCT = SCE + SCR
$R^2 = 0.73$
Le modèle explique 73% de la variance totale.
Selon ce modèle, le pays de naissance (ie. le revenu moyen et l’indice de Gini) explique donc 73% de la variance totale, tandis que les autres facteurs non considérés dans le modèle (efforts, chance, etc.) représentent les 27% restants. Le passage en log des variables de revenus améliore le pouvoir explicatif du modèle, et l'indice de Gini devient significatif. Cependant, l'interprétation est rendue plus compliquée du fait des logs (pas de "lecture naturelle" possible).
# Model 2 — explanatory variables: the country's Gini index, the country's
# mean income, and the parents' income class.
modele_2 = smf.ols('y_child ~ G_j+m_j+c_i_parent', data=data_cloned_c)
reg_v2_1 = modele_2.fit()
print(reg_v2_1.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y_child R-squared: 0.645
Model: OLS Adj. R-squared: 0.645
Method: Least Squares F-statistic: 6539.
Date: Fri, 06 May 2022 Prob (F-statistic): 0.00
Time: 17:55:39 Log-Likelihood: -1.0872e+05
No. Observations: 10800 AIC: 2.175e+05
Df Residuals: 10796 BIC: 2.175e+05
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -3.381e+04 589.071 -57.400 0.000 -3.5e+04 -3.27e+04
G_j 1.8736 6.849 0.274 0.784 -11.552 15.299
m_j 0.9982 0.009 113.627 0.000 0.981 1.015
c_i_parent 668.3549 10.068 66.385 0.000 648.620 688.090
==============================================================================
Omnibus: 15873.287 Durbin-Watson: 0.716
Prob(Omnibus): 0.000 Jarque-Bera (JB): 9136681.266
Skew: 8.791 Prob(JB): 0.00
Kurtosis: 144.402 Cond. No. 9.75e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.75e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
# Variance decomposition of model 2: total, explained and residual sums of
# squares, plus R², displayed as Markdown.
reg_v2_1_sct = reg_v2_1.centered_tss
reg_v2_1_sce = reg_v2_1.ess
reg_v2_1_scr = reg_v2_1.ssr
reg_v2_1_r2 = reg_v2_1.rsquared
display(Markdown(f"""
Décomposition de la variance : <br>
**SCT = SCE + SCR**
- SCT = {reg_v2_1_sct:.0f} <br>
- SCE = {reg_v2_1_sce:.0f} <br>
- SCR = {reg_v2_1_scr:.0f} <br>
$R^2 = {reg_v2_1_r2:.2f}$
Le modèle explique **{reg_v2_1_r2*100:.0f}% de la variance totale**.
"""))
Décomposition de la variance :
SCT = SCE + SCR
$R^2 = 0.65$
Le modèle explique 65% de la variance totale.
Selon ce modèle, le revenu moyen du pays, la classe de revenus des parents et l’indice de Gini expliquent donc 65% de la variance totale, tandis que les autres facteurs non considérés dans le modèle (efforts, chance, etc.) représentent les 35% restants. </br> Cependant, et comme pour le premier modèle, l'indice de Gini n'est pas statistiquement significatif au seuil de 0,05.
# Model 2_2 — explanatory variables: log of the country's mean income,
# the country's Gini index, and the parents' income class.
# This is the final model retained for the rest of the analysis.
reg_v2_2 = smf.ols('income_log ~ G_j+m_j_log+c_i_parent', data=data_cloned_c).fit()
reg_v2_2_sct = reg_v2_2.centered_tss
reg_v2_2_sce = reg_v2_2.ess
reg_v2_2_scr = reg_v2_2.ssr
reg_v2_2_r2 = reg_v2_2.rsquared
display(Markdown(f"""
Décomposition de la variance : <br>
**SCT = SCE + SCR**
- SCT = {reg_v2_2_sct:.0f} <br>
- SCE = {reg_v2_2_sce:.0f} <br>
- SCR = {reg_v2_2_scr:.0f} <br>
$R^2 = {reg_v2_2_r2:.4f}$
Le modèle explique **{reg_v2_2_r2*100:.0f}% de la variance totale**.
"""))
Décomposition de la variance :
SCT = SCE + SCR
$R^2 = 0.9593$
Le modèle explique 96% de la variance totale.
Le modèle le plus performant est celui expliquant le revenu des enfants en log par le revenu du pays en log, l'indice de gini et la classe de revenus des parents, avec un $R^2 = 0,96$.
On risque cependant de faire face à un problème de sur-ajustement du modèle, et une complication de l'interprétation du fait de l'utilisation des log.
print("Le R^2 ajusté est similaire, avec une valeur de ", round(reg_v2_2.rsquared_adj, 4))
Le R^2 ajusté est similaire, avec une valeur de 0.9593
# Recap table of the four fitted models: name, formula, R², and whether
# log-transformed variables were used.
R2 = [reg_v1_1_r2, reg_v1_2_r2, reg_v2_1_r2, reg_v2_2_r2]
reg_name = ["Modèle 1", "Modèle 1_2", "Modèle 2", "Modèle 2_2"]
log = ["Non", "Oui", "Non", "Oui"]
var = [
    'y_child ~ G_j+m_j',
    'income_log ~ G_j+m_j_log',
    'y_child ~ G_j+m_j+c_i_parent',
    'income_log ~ G_j+m_j_log+c_i_parent',
]
list_of_tuples = list(zip(reg_name, var, R2, log))
list_of_tuples
df = pd.DataFrame(list_of_tuples, columns=['Modèle', 'Variables', 'R2', 'Logarithme'])
df['R2'] = df['R2'].round(2)
df
| Modèle | Variables | R2 | Logarithme | |
|---|---|---|---|---|
| 0 | Modèle 1 | y_child ~ G_j+m_j | 0.50 | Non |
| 1 | Modèle 1_2 | income_log ~ G_j+m_j_log | 0.73 | Oui |
| 2 | Modèle 2 | y_child ~ G_j+m_j+c_i_parent | 0.65 | Non |
| 3 | Modèle 2_2 | income_log ~ G_j+m_j_log+c_i_parent | 0.96 | Oui |
On va conserver le modèle 2_2, utilisant à la fois les revenus (en logarithme), l'indice de Gini et la classe de revenu des parents : c'est le modèle au pouvoir explicatif le plus élevé, même si l'usage des logarithmes rend l'interprétation des coefficients moins directe. On va maintenant étudier la robustesse de ce modèle.
# Study parameters
n = data_cloned_c.shape[0]  # sample size (number of observations)
# Number of estimated parameters (3 regressors + intercept). Derived from the
# fitted model rather than hard-coded, so it stays in sync with reg_v2_2.
# (The original hard-coded p = 4 and mislabelled it "nombre de variables".)
p = int(reg_v2_2.df_model) + 1
# Leverage threshold according to Belsley: 2 * p / n
seuil_levier = 2 * p/n
seuil_levier
0.0007407407407407407
# Attach each observation's leverage (diagonal of the hat matrix) and split
# the sample around the Belsley threshold.
data_cloned_c['Leviers'] = reg_v2_2.get_influence().hat_matrix_diag
haut_levier = data_cloned_c['Leviers'] > seuil_levier
Leviers_sup = data_cloned_c.loc[haut_levier, :]
Leviers_inf = data_cloned_c.loc[~haut_levier, :]
# Display with 7 decimals on the leverage column.
Leviers_sup.head().style.format({"Leviers": "{:,.7f}"})
| country | y_child | m_j | G_j | p_j | m_j_log | income_log | c_i_parent | Leviers | |
|---|---|---|---|---|---|---|---|---|---|
| 900 | Bolivia | 20.584948 | 3016.263843 | 56.877778 | 0.866268 | 8.011774 | 3.024560 | 41.000000 | 0.0008012 |
| 901 | Bolivia | 57.159256 | 3016.263843 | 56.877778 | 0.866268 | 8.011774 | 4.045841 | 41.000000 | 0.0008012 |
| 902 | Bolivia | 85.552185 | 3016.263843 | 56.877778 | 0.866268 | 8.011774 | 4.449127 | 41.000000 | 0.0008012 |
| 903 | Bolivia | 112.422820 | 3016.263843 | 56.877778 | 0.866268 | 8.011774 | 4.722267 | 42.000000 | 0.0007453 |
| 904 | Bolivia | 143.889390 | 3016.263843 | 56.877778 | 0.866268 | 8.011774 | 4.969045 | 42.000000 | 0.0007453 |
len(Leviers_sup['country'].unique())
9
On a donc 9 pays présentant des observations au dessus du seuil
%%time
# Bar plot of the leverages, split by whether they exceed the Belsley threshold.
plt.figure(figsize=(10,6))
# Observations below the threshold
plt.bar(Leviers_inf['country'],Leviers_inf['Leviers'])
# Observations above the threshold
plt.bar(Leviers_sup['country'],Leviers_sup['Leviers'])
# Decoration and annotations
plt.title('Représentation des leviers', fontsize=22)
plt.xlabel('Pays', fontsize=18)
# Empty tick list hides the (otherwise unreadable) country labels on the x axis.
plt.xticks('', fontsize=16)
plt.ylabel('Leviers', fontsize=18)
plt.yticks(fontsize=16)
plt.axhline(y=seuil_levier, linestyle='-')
plt.text(50, 0.00085 , 'Seuil levier', fontsize = '14', color='red')
plt.tight_layout()
plt.show()
Wall time: 14.7 s
# Names of the countries with high-leverage observations.
country_lev = Leviers_sup['country'].unique()
country_lev
array(['Bolivia', 'Brazil', 'Central African Republic', 'Colombia',
'Congo, Dem. Rep.', 'Eswatini', 'Honduras', 'Panama',
'South Africa'], dtype=object)
Le seuil pour les résidus studentisés est le quantile d'une loi de Student à n-p-1 degrés de liberté
from scipy.stats import t, shapiro
# Significance level for the two-sided Student threshold.
alpha = 0.05
# NOTE(review): resid_studentized_internal yields *internally* studentized
# (standardized) residuals; externally studentized residuals would be
# resid_studentized_external — confirm which was intended for this column.
data_cloned_c['rstudent'] = reg_v2_2.get_influence().resid_studentized_internal
# Two-sided Student quantile at level alpha with n-p-1 degrees of freedom.
seuil_rstudent = t.ppf(1-alpha/2,n-p-1)
seuil_rstudent
1.960183765245542
# Test statistic per observation: the five most negative studentized
# residuals (largest downward outliers).
data_cloned_c.sort_values(by='rstudent').head()
| country | y_child | m_j | G_j | p_j | m_j_log | income_log | c_i_parent | Leviers | rstudent | |
|---|---|---|---|---|---|---|---|---|---|---|
| 5500 | Korea, Rep. | 17.317732 | 15227.569445 | 32.000000 | 0.394000 | 9.630863 | 2.851731 | 41.0 | 0.000558 | -19.186032 |
| 1500 | Canada | 122.424810 | 23739.640518 | 33.500000 | 0.269333 | 10.074902 | 4.807497 | 41.0 | 0.000651 | -13.713928 |
| 1800 | China | 16.719418 | 2522.758726 | 41.150000 | 0.399000 | 7.833108 | 2.816571 | 41.0 | 0.000382 | -12.544563 |
| 900 | Bolivia | 20.584948 | 3016.263843 | 56.877778 | 0.866268 | 8.011774 | 3.024560 | 41.0 | 0.000801 | -11.551001 |
| 4900 | Israel | 142.350430 | 11100.318137 | 40.912500 | 0.660000 | 9.314729 | 4.958292 | 41.0 | 0.000510 | -10.119617 |
data_cloned_c.sort_values(by='rstudent').tail()
| country | y_child | m_j | G_j | p_j | m_j_log | income_log | c_i_parent | Leviers | rstudent | |
|---|---|---|---|---|---|---|---|---|---|---|
| 999 | Bolivia | 43786.920 | 3016.263843 | 56.877778 | 0.866268 | 8.011774 | 10.687090 | 60.0 | 0.000807 | 7.296553 |
| 8999 | South Africa | 82408.550 | 5617.904880 | 61.866667 | 0.677000 | 8.633714 | 11.319444 | 60.0 | 0.001187 | 7.655733 |
| 4199 | Honduras | 56265.700 | 3296.268419 | 56.840000 | 0.660000 | 8.100546 | 10.937840 | 60.0 | 0.000814 | 7.874688 |
| 1699 | Central African Republic | 14472.948 | 811.299901 | 56.200000 | 0.660000 | 6.698638 | 9.580037 | 60.0 | 0.000786 | 7.901420 |
| 3999 | Guatemala | 39799.406 | 2142.474753 | 54.400000 | 1.015206 | 7.669717 | 10.591607 | 60.0 | 0.000677 | 8.008709 |
# Split observations into: inside the acceptance band, above the upper
# threshold, and below the lower threshold.
rstudent_in = data_cloned_c.loc[data_cloned_c['rstudent'].abs() <= seuil_rstudent, :]
rstudent_sup = data_cloned_c.loc[data_cloned_c['rstudent'] > seuil_rstudent, :]
rstudent_inf = data_cloned_c.loc[data_cloned_c['rstudent'] < -seuil_rstudent, :]
%%time
# Bar plot of the studentized residuals with both thresholds drawn.
plt.figure(figsize=(10,10))
# Observations between the lower and upper thresholds
plt.bar(rstudent_in['country'],rstudent_in['rstudent'], color='orange')
# Observations above the upper threshold
plt.bar(rstudent_sup['country'],rstudent_sup['rstudent'], color='steelblue')
# Observations below the lower threshold
plt.bar(rstudent_inf['country'],rstudent_inf['rstudent'], color='steelblue')
# Annotations
plt.title('Représentation des résidus studentisés', fontsize=22)
plt.xlabel('Pays', fontsize=18)
# Empty tick list hides the country labels on the x axis.
plt.xticks('', fontsize=16)
plt.ylim(-22,9)
plt.ylabel('Résidus studentisés', fontsize=18)
plt.yticks(fontsize=16)
plt.axhline(y=seuil_rstudent, color='steelblue', linestyle='-')
plt.text(50, 6.2 , 'Seuil rstudent', fontsize = '18', color='steelblue')
plt.axhline(y=-seuil_rstudent, color='steelblue', linestyle='-')
plt.text(50, -6.2 , '-Seuil rstudent', fontsize = '18', color='steelblue')
plt.tight_layout()
plt.show()
Wall time: 13.2 s
# Number of atypical values on the response: studentized residual outside
# the [-seuil_rstudent, seuil_rstudent] band.
res_stu_ln = data_cloned_c.loc[data_cloned_c['rstudent'].abs() > seuil_rstudent]
len(res_stu_ln)
384
# Countries with atypical observations, sorted by decreasing count.
# (The original chained assignment `outliers_country = _country = ...` also
# created a stray `_country` global by accident; removed.)
outliers_country = data_cloned_c.loc[data_cloned_c.index.isin(res_stu_ln.index)].groupby(by='country').count().sort_values(by='y_child',ascending=False)
outliers_country
| y_child | m_j | G_j | p_j | m_j_log | income_log | c_i_parent | Leviers | rstudent | |
|---|---|---|---|---|---|---|---|---|---|
| country | |||||||||
| South Africa | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 |
| Honduras | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 |
| Bolivia | 16 | 16 | 16 | 16 | 16 | 16 | 16 | 16 | 16 |
| Colombia | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 |
| Panama | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Kyrgyz Republic | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| Lao PDR | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| Mongolia | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| Nepal | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| Yemen, Rep. | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
94 rows × 9 columns
# Dataframe with all available influence diagnostics for the final model
# (DFBETAS, Cook's distance, standardized residuals, leverage, DFFITS...).
influence_ln = reg_v2_2.get_influence().summary_frame()
influence_ln.head()
| dfb_Intercept | dfb_G_j | dfb_m_j_log | dfb_c_i_parent | cooks_d | standard_resid | hat_diag | dffits_internal | student_resid | dffits | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.005953 | 0.003293 | 0.001139 | 0.005735 | 0.000014 | -0.341010 | 0.000468 | -0.007376 | -0.340996 | -0.007376 |
| 1 | 0.008230 | -0.004553 | -0.001575 | -0.007929 | 0.000026 | 0.471494 | 0.000468 | 0.010199 | 0.471477 | 0.010198 |
| 2 | 0.006182 | -0.003673 | -0.001275 | -0.005728 | 0.000015 | 0.380626 | 0.000411 | 0.007722 | 0.380610 | 0.007721 |
| 3 | 0.010355 | -0.006153 | -0.002136 | -0.009595 | 0.000042 | 0.637550 | 0.000411 | 0.012934 | 0.637533 | 0.012933 |
| 4 | 0.012731 | -0.007564 | -0.002626 | -0.011796 | 0.000063 | 0.783833 | 0.000411 | 0.015901 | 0.783819 | 0.015901 |
# Add the Cook's distance column to data_cloned_c.
data_cloned_c['dcooks'] = influence_ln['cooks_d']
# Influence threshold according to Cook: 4 / (n - p)
seuil_dcook = 4/(n-p)
# Five smallest Cook's distances, shown with 13 decimals.
data_cloned_c.sort_values(by='dcooks').head().style.format({"dcooks": "{:,.13f}"})
| country | y_child | m_j | G_j | p_j | m_j_log | income_log | c_i_parent | Leviers | rstudent | dcooks | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 839 | Bhutan | 962.803830 | 1515.929061 | 39.500000 | 0.500000 | 7.323784 | 6.869850 | 49.000000 | 0.000140 | -0.000076 | 0.0000000000002 |
| 9819 | Turkey | 2243.633800 | 6050.465331 | 40.642857 | 0.400000 | 8.707890 | 7.715852 | 45.000000 | 0.000227 | 0.000091 | 0.0000000000005 |
| 1445 | Cameroon | 1226.937600 | 1794.493964 | 42.450000 | 0.660000 | 7.492478 | 7.112277 | 50.000000 | 0.000124 | -0.000150 | 0.0000000000007 |
| 9133 | Sri Lanka | 1042.414200 | 1877.940244 | 39.950000 | 0.500000 | 7.537931 | 6.949295 | 48.000000 | 0.000132 | -0.000278 | 0.0000000000026 |
| 1040 | Bosnia and Herzegovina | 4383.593300 | 6334.687311 | 32.366667 | 0.827252 | 8.753796 | 8.385624 | 49.000000 | 0.000165 | 0.000324 | 0.0000000000043 |
data_cloned_c.sort_values(by='dcooks').tail().style.format({"dcooks": "{:,.7f}"})
| country | y_child | m_j | G_j | p_j | m_j_log | income_log | c_i_parent | Leviers | rstudent | dcooks | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 8999 | South Africa | 82408.550000 | 5617.904880 | 61.866667 | 0.677000 | 8.633714 | 11.319444 | 60.000000 | 0.001187 | 7.655733 | 0.0174122 |
| 900 | Bolivia | 20.584948 | 3016.263843 | 56.877778 | 0.866268 | 8.011774 | 3.024560 | 41.000000 | 0.000801 | -11.551001 | 0.0267462 |
| 8900 | South Africa | 60.490383 | 5617.904880 | 61.866667 | 0.677000 | 8.633714 | 4.102484 | 41.000000 | 0.001181 | -9.615355 | 0.0273406 |
| 1500 | Canada | 122.424810 | 23739.640518 | 33.500000 | 0.269333 | 10.074902 | 4.807497 | 41.000000 | 0.000651 | -13.713928 | 0.0306344 |
| 5500 | Korea, Rep. | 17.317732 | 15227.569445 | 32.000000 | 0.394000 | 9.630863 | 2.851731 | 41.000000 | 0.000558 | -19.186032 | 0.0513742 |
# Split observations around the Cook's distance threshold.
influent = data_cloned_c['dcooks'] > seuil_dcook
cooks_sup = data_cloned_c.loc[influent, :]
cooks_inf = data_cloned_c.loc[~influent, :]
%%time
# Bar plot of Cook's distances, colored by whether they exceed the threshold.
plt.figure(figsize=(10,10))
# Observations below the threshold
plt.bar(cooks_inf['country'],cooks_inf['dcooks'], color='blue')
# Observations above the threshold
plt.bar(cooks_sup['country'],cooks_sup['dcooks'], color='orange')
# Decoration and annotations
plt.title('Représentation de la distance de Cook', fontsize=22)
plt.ylabel('Distance de Cook', fontsize=18)
# NOTE(review): the second plt.xticks call overrides the first, so the
# country labels are NOT hidden here (unlike the previous plots) — confirm
# which behaviour was intended.
plt.xticks('', fontsize=16)
plt.xticks(fontsize=16)
plt.text(-5, (seuil_dcook + 0.00300), 'Seuil de Cook', fontsize = '18', color='steelblue')
plt.axhline(y=seuil_dcook, color='steelblue', linestyle='-')
plt.tight_layout()
plt.show()
Wall time: 13.3 s
# Number of influential observations (same selection as cooks_sup).
dco_ln = data_cloned_c.loc[data_cloned_c['dcooks'] > seuil_dcook]
len(dco_ln)
516
# Countries with observations above the Cook threshold, sorted by count.
data_cloned_c.loc[data_cloned_c.index.isin(dco_ln.index)].groupby(by='country').count().sort_values(by='y_child',ascending=False)
| y_child | m_j | G_j | p_j | m_j_log | income_log | c_i_parent | Leviers | rstudent | dcooks | |
|---|---|---|---|---|---|---|---|---|---|---|
| country | ||||||||||
| South Africa | 64 | 64 | 64 | 64 | 64 | 64 | 64 | 64 | 64 | 64 |
| Honduras | 30 | 30 | 30 | 30 | 30 | 30 | 30 | 30 | 30 | 30 |
| Bolivia | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 |
| Colombia | 19 | 19 | 19 | 19 | 19 | 19 | 19 | 19 | 19 | 19 |
| Panama | 18 | 18 | 18 | 18 | 18 | 18 | 18 | 18 | 18 | 18 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Jordan | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| Nepal | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| Lao PDR | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| Mongolia | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| Albania | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
100 rows × 10 columns
# Observations that are simultaneously atypical (|rstudent| beyond the
# Student threshold) and influential (Cook's distance AND leverage above
# their thresholds). Factored form of the original two-branch condition.
atypique = ((data_cloned_c['rstudent'] > seuil_rstudent) |
            (data_cloned_c['rstudent'] < -seuil_rstudent))
influent = ((data_cloned_c['dcooks'] > seuil_dcook) &
            (data_cloned_c['Leviers'] > seuil_levier))
ind_aty_infl_ln = data_cloned_c.loc[atypique & influent]
len(ind_aty_infl_ln)
107
# Count the suspect (atypical AND influential) observations per country.
susp_pt = data_cloned_c[data_cloned_c.index.isin(ind_aty_infl_ln.index)]
susp_pt = susp_pt.reset_index()
susp_pt.groupby('country').count()
| index | y_child | m_j | G_j | p_j | m_j_log | income_log | c_i_parent | Leviers | rstudent | dcooks | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| country | |||||||||||
| Bolivia | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 |
| Brazil | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 |
| Central African Republic | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 |
| Colombia | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 |
| Congo, Dem. Rep. | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| Eswatini | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| Honduras | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 |
| Panama | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 |
| South Africa | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 |
susp_pt.groupby('country').count().sort_values(by = "index",ascending=False)
| index | y_child | m_j | G_j | p_j | m_j_log | income_log | c_i_parent | Leviers | rstudent | dcooks | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| country | |||||||||||
| South Africa | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 | 41 |
| Honduras | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 | 14 |
| Bolivia | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 | 13 |
| Brazil | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 |
| Panama | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 |
| Colombia | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 |
| Central African Republic | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 | 6 |
| Congo, Dem. Rep. | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
| Eswatini | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
# Influence plot: leverage vs studentized residuals, bubble size = influence.
# The original line fused the import and the call into a single (invalid)
# statement; split them, and import the one name used instead of `*`.
from statsmodels.graphics.regressionplots import influence_plot
influence_plot(reg_v2_2)  # expects a fitted statsmodels results object
plt.show()
# Variance inflation factors for each regressor; column 0 (the intercept)
# is skipped.
variables = reg_v2_2.model.exog
[variance_inflation_factor(variables, col) for col in range(1, variables.shape[1])]
[1.0941154281919516, 1.094130771502384, 1.0000625744665907]
# Breusch-Pagan heteroscedasticity test on the final model's residuals;
# keep the LM p-value (and the F p-value, unused below).
_, pval, __, f_pval = sm.stats.diagnostic.het_breuschpagan(reg_v2_2.resid, variables)
print('p value test Breusch Pagan:', pval)
p value test Breusch Pagan: 4.721449008350039e-28
shapiro(reg_v2_2.resid)
C:\Users\mouly\anaconda3\lib\site-packages\scipy\stats\morestats.py:1681: UserWarning: p-value may not be accurate for N > 5000.
ShapiroResult(statistic=0.8224266767501831, pvalue=0.0)
stats.kstest(reg_v2_2.resid, 'norm')
KstestResult(statistic=0.31936833838779277, pvalue=0.0)
# Q-Q plot (Henry line) of the final model's residuals.
stats.probplot(reg_v2_2.resid, dist="norm", plot=pylab)
pylab.show()
plt.hist(reg_v2_2.resid)
(array([1.000e+00, 0.000e+00, 3.000e+00, 4.000e+00, 1.400e+01, 8.400e+01,
4.297e+03, 6.269e+03, 1.100e+02, 1.800e+01]),
array([-5.4120916 , -4.64498229, -3.87787298, -3.11076367, -2.34365435,
-1.57654504, -0.80943573, -0.04232642, 0.72478289, 1.4918922 ,
2.25900151]),
<BarContainer object of 10 artists>)
# Residual distribution: Q-Q plot (left) and histogram (right) side by side.
fig, ax = plt.subplots(1, 2, figsize=(20,10))
# Draw the histogram explicitly on the right-hand axes. The original used
# plt.hist(), which only landed on ax[1] because it happened to be the
# current axes after plt.subplots(); the explicit target is equivalent
# but no longer order-dependent.
ax[1].hist(reg_v2_2.resid, density=True)
# Internally studentized residuals for the Q-Q plot.
model_norm_residuals = reg_v2_2.get_influence().resid_studentized_internal
QQ = sm.ProbPlot(model_norm_residuals)
QQ.qqplot(line='45', alpha=0.5, color='#4C72B0', ax=ax[0])
ax[0].set_title('Q-Q Plot')
ax[1].set_title('Histogramme des résidus')
ax[1].set_xlabel('Valeurs résiduelles')
ax[1].set_ylabel('Nombre de résidus')
plt.show()
# Homoscedasticity: Breusch-Pagan test (LM stat, LM p, F stat, F p) and
# residuals-vs-fitted scatter for the final model.
print(sm.stats.diagnostic.het_breuschpagan(reg_v2_2.resid, reg_v2_2.model.exog))
ax=plt.plot(reg_v2_2.fittedvalues, reg_v2_2.resid, ".", alpha=0.3)
plt.title("Nuage de la variance résiduelle", fontsize=18)
#plt.xlabel("GWh", fontsize=16), plt.ylabel("Résidus", fontsize=16)
# Rainbow linearity test on the final model (H0: the relationship is linear).
from statsmodels.stats.diagnostic import linear_rainbow
Ftest, pval = linear_rainbow(reg_v2_2)
print(pval)
(130.27376846278395, 4.721449008350039e-28, 43.938509567599944, 3.2548335815251424e-28) 5.307216185485038e-08
# List of the countries present in the sample (valid values for the
# prediction cell below).
country = data_cloned_c['country'].unique()
country
array(['Albania', 'Argentina', 'Armenia', 'Austria', 'Azerbaijan',
'Bangladesh', 'Belarus', 'Belgium', 'Bhutan', 'Bolivia',
'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Burkina Faso',
'Cameroon', 'Canada', 'Central African Republic', 'Chile', 'China',
'Colombia', 'Congo, Dem. Rep.', 'Costa Rica', "Cote d'Ivoire",
'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic',
'Ecuador', 'Egypt, Arab Rep.', 'El Salvador', 'Estonia',
'Eswatini', 'Fiji', 'Finland', 'France', 'Georgia', 'Germany',
'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Honduras', 'Hungary',
'Iceland', 'India', 'Indonesia', 'Iran, Islamic Rep.', 'Iraq',
'Ireland', 'Israel', 'Italy', 'Japan', 'Jordan', 'Kazakhstan',
'Kenya', 'Korea, Rep.', 'Kosovo', 'Kyrgyz Republic', 'Lao PDR',
'Latvia', 'Liberia', 'Lithuania', 'Luxembourg', 'Madagascar',
'Malawi', 'Malaysia', 'Mali', 'Mauritania', 'Mexico', 'Moldova',
'Mongolia', 'Morocco', 'Mozambique', 'Nepal', 'Netherlands',
'Nicaragua', 'Niger', 'Nigeria', 'Norway', 'Pakistan', 'Panama',
'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Romania',
'Slovak Republic', 'Slovenia', 'South Africa', 'Spain',
'Sri Lanka', 'Sweden', 'Syrian Arab Republic', 'Tajikistan',
'Tanzania', 'Thailand', 'Timor-Leste', 'Turkey', 'Uganda',
'Ukraine', 'United Kingdom', 'United States', 'Uruguay',
'Venezuela, RB', 'Vietnam', 'West Bank and Gaza', 'Yemen, Rep.'],
dtype=object)
# Prediction with the retained model (reg_v2_2): income of a child born in
# `pays_selectionné` whose parents belong to income class `c_i_parent_target`.
pays_selectionné = 'France'
c_i_parent_target = 50.0
print('Le pays selectionné est :', pays_selectionné)
# Country-level features, read from the first row of the selected country.
revenu_moyen_pays = data_cloned_c.loc[(data_cloned_c['country'] == pays_selectionné), 'm_j'].iloc[0]
print('Le revenu moyen du pays selectionné est de', round(revenu_moyen_pays, 2),"$")
revenu_moyen_pays_log = np.log(revenu_moyen_pays)
print('Le revenu moyen en log du pays selectionné est de', round(revenu_moyen_pays_log, 3))
indice_gini_pays = data_cloned_c.loc[(data_cloned_c['country'] == pays_selectionné), 'G_j'].iloc[0]
print("L'indice de gini du pays selectionné est de", round(indice_gini_pays))
# Exogenous row matching the reg_v2_2 formula (income_log ~ G_j + m_j_log + c_i_parent).
a_prevoir = pd.DataFrame({'m_j_log':[revenu_moyen_pays_log], 'G_j':[indice_gini_pays],'c_i_parent':[c_i_parent_target]})
Revenu_enfant = reg_v2_2.predict(a_prevoir)
# The model predicts log-income: take the single predicted value with
# .iloc[0] and exponentiate it back to dollars. (Replaces the fragile
# `np.exp(*Revenu_enfant)` star-unpacking of a one-element Series.)
Revenu_enfant_calculé = np.exp(Revenu_enfant.iloc[0])
Revenu_enfant_calculé
print("Le revenu d'un individu en",pays_selectionné,
      "dont les parents ont une classe de revenu égale à",c_i_parent_target,
      "est de",round(Revenu_enfant_calculé, 2),"$")
Le pays selectionné est : France Le revenu moyen du pays selectionné est de 18309.41 $ Le revenu moyen en log du pays selectionné est de 9.815 L'indice de gini du pays selectionné est de 31 Le revenu d'un individu en France dont les parents ont une classe de revenu égale à 50.0 est de 14324.41 $
# Sanity check: observed mean income in the sample for the same
# country / parental income class, to compare with the model's prediction.
f = data_cloned_c[data_cloned_c['country'] == pays_selectionné]
g = f[f["c_i_parent"] == c_i_parent_target]
h = g["y_child"].mean()
print(round(h, 2))
14196.13